TL;DR

The goal of this document is to create a worflow to cluster scRNASeq data with an example on olfactory data. It is not meant to be the final document where we would expect more written explanations. It is meant to choose the functions, parameters, and plots we want to show. Another goal of this document is also to see how we could use clusterExperiment in combinaison with zinbwave.

load('../data/Expt4c2b_filtdata.Rda')
load('../data/E4c2b_slingshot_wsforkelly.RData')
load('olfactory.rda')

Let’s create a SummarizedExperiment object. In colData, we add a column for the batches and another column for the clus.labels.

Could you remind me where are the clus.labels from? I think we should have a more explicit name, like bio_condition or paper_clusters.

names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
qc$batch = batch
qc$clusters = clus.labels

se = SummarizedExperiment(assays = list(counts = counts),
                          colData = qc)

assay(se)[1:2,1:5]
##       OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507 OEP01_N705_S501
## CreER               1            3022            2329             717
## Xkr4                0               0               0              14
##       OEP01_N702_S505
## CreER            7007
## Xkr4                0
colData(se)[,1:5]
## DataFrame with 616 rows and 5 columns
##                    NREADS  NALIGNED    RALIGN TOTAL_DUP    PRIMER
##                 <numeric> <numeric> <numeric> <numeric> <numeric>
## OEP01_N706_S501   3313260   3167600   95.6035   47.9943 0.0154566
## OEP01_N701_S501   2902430   2757790   95.0167   45.0150 0.0182066
## OEP01_N707_S507   2307940   2178350   94.3852   43.7832 0.0219196
## OEP01_N705_S501   3337400   3183720   95.3952   43.2688 0.0183041
## OEP01_N702_S505   2984830   2844350   95.2935   66.3881 0.0190047
## ...                   ...       ...       ...       ...       ...
## OEL23_N701_S511   2195310   2083280   94.8966   52.6111 0.0208372
## OEL23_N701_S508    928692    829078   89.2737   49.0569 0.0622330
## OEL23_N706_S502   2215640   2108490   95.1637   50.6643 0.0182207
## OEL23_N704_S503   2673790   2568300   96.0546   60.5481 0.0155611
## OEL23_N703_S502   2450320   2363500   96.4567   48.4164 0.0122704

Let’s filter out genes with too many zeros and normalize counts using FQ limma normalization.

pass_filter = apply(assay(se), 1, function(x) length(x[x >= 10]) >= 10)
se <- se[pass_filter,]
dim(se)
## [1] 12466   616
fq <- round(limma::normalizeQuantiles(assay(se)))
assays(se) <- list(normalized_counts=fq)

clusterMany

We included the clus.labels and batch. It is wanted that the cells don’t cluster by batch.

I think this is where we could include zinbwave. We could add zinbwave as dimReduce in clusterExperiment with an argument similar to nPCADims (something like nZINBDims) which would correspond to the K in the W matrix. We could then skip the normalization of the counts using limma (see above). We would need to set isCount = FALSE. Thoughts?

ce <- clusterMany(se,
                  clusterFunction = "pam",
                  ks = seq(9, 15, 3),
                  isCount = TRUE,
                  dimReduce = c("PCA", "var"),
                  nVarDims = c(100, 500, 1000),
                  nPCADims = c(5, 15, 50),
                  run = TRUE)

defaultMar <- par("mar")
plotCMar <- c(1.1, 16.1, 4.1, 1.1)
par(mar = plotCMar)
plotClusters(ce,
             main = "Clusters from clusterMany",
             axisLine = -1,
             sampleData=c("batch","clusters"))

Remove ‘features’ from the clusters labels to have more succint names.

cl = clusterLabels(ce)
cl = gsub("Features", "", cl)
clusterLabels(ce) = cl

Re-order the clusters by dimension and not by k.

cl = clusterLabels(ce)
ndims = sapply(cl, function(x) strsplit(x, '=|,')[[1]][2])
ord <- order(as.numeric(ndims))
par(mar = plotCMar)
plotClusters(ce,
             main = "Clusters from clusterMany",
             whichClusters = ord, axisLine = -1,
             sampleData=c("batch","clusters"))

The output matrix of clusterMany is

clusterMatrix(ce)[1:3,1:3]
##      nVAR=100,k=9 nVAR=500,k=9 nVAR=1000,k=9
## [1,]            1            1             1
## [2,]            1            1             1
## [3,]            1            2             1

CombineMany

ce = combineMany(ce)
## Note: no clusters specified to combine, using results from clusterMany
head(clusterMatrix(ce)[,1:3])
##      combineMany nVAR=100,k=9 nVAR=500,k=9
## [1,]           1            1            1
## [2,]           2            1            1
## [3,]          -1            1            2
## [4,]          -1            1            2
## [5,]          -1            1            1
## [6,]           1            1            1

Look at the new row added for combineMany

par(mar = plotCMar)
plotClusters(ce, whichClusters = "workflow",
             axisLine = -1, main = "Clusters from clusterMany",
             sampleData=c("batch","clusters"))

clusterMany requires samples to be in the same cluster in every clustering to be assigned a cluster. We need to change the minimum proportion of times they should be together with other samples in the cluster they are assigned to.

wh = which(clusterLabels(ce) == "combineMany")
if (length(wh) != 1){
  stop()
}else{
  clusterLabels(ce)[wh] = "combineMany,default"
} 
# change proportion
ce = combineMany(ce, proportion = 0.7,
                 clusterLabel = "combineMany,0.7")
## Note: no clusters specified to combine, using results from clusterMany
# plot
par(mar=plotCMar)
plotClusters(ce, whichClusters = "workflow",
             axisLine = -1, main = "Clusters from clusterMany",
             sampleData=c("batch","clusters"))

We can also play with the minSize parameter.

# change minsize
ce = combineMany(ce, proportion = 0.7, minSize = 5, 
                 clusterLabel = "combineMany,final")
## Note: no clusters specified to combine, using results from clusterMany
# plot
par(mar=plotCMar)
plotClusters(ce, whichClusters = "workflow",
             axisLine = -1, main = "Clusters from clusterMany",
             sampleData=c("batch","clusters"))

The proportion of times these clusters were together across these clusterings

plotCoClustering(ce)

mergeClusters

Cluster hierarchy for combineMany,final where before clustering PCA is performed and 15 most variable genes are kept

# makeDendrogram
ce = makeDendrogram(ce, whichCluster = "primaryCluster",
                    dimReduce = "PCA", ndims = 15)
plotDendrogram(ce)

We have not merged the clusters yet, but we have made the dendrogram

ce
## class: ClusterExperiment 
## dim: 12466 616 
## Primary cluster type: combineMany 
## Primary cluster label: combineMany,final 
## Table of clusters (of primary clustering):
##  -1  c1 c10 c11 c12 c13 c14 c15 c16 c17 c18 c19  c2 c20 c21 c22 c23 c24 
## 121  37   6  12  10  15  11  13  31  21   8  37  20  25  26   5   5  20 
## c25 c26 c27 c28 c29  c3 c30 c31 c32 c33 c34  c4  c5  c6  c7  c8  c9 
##   6   7   6  31  10   8   7   6   9   5   6   7  10  22  28  14  11 
## Total number of clusterings: 21 
## Dendrogram run on 'combineMany,final' (cluster index: 1)
## -----------
## Workflow progress:
## clusterMany run? Yes 
## combineMany run? Yes 
## makeDendrogram run? Yes 
## mergeClusters run? No

Before to actually merge the clusters, we are going to visualize the merge at the default cutoff (0.1)

mergeClusters(ce, mergeMethod = "adjP", plot = "mergeMethod")
## Note: Merging will be done on ' combineMany,final ', with clustering index 1

ce = mergeClusters(ce, mergeMethod = "adjP", cutoff = 0.1)
## Note: Merging will be done on ' combineMany,final ', with clustering index 1
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 17.2. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 16.3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 5.6. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 8.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 5.3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 11.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 30.3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 14.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 12.5. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 23.3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 5.5. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 6.3. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 6.6. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 79.4. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 4. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 6.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 8.7. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 16.2. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 12.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 24.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 21.5. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 60.2. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 37.5. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 139.3. Rerun
## with increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 8.9. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 46.9. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 56. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 17.9. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 11.8. Rerun with
## increased df
## Warning in locfdr::locfdr(tstats, plot = 0): f(z) misfit = 10.4. Rerun with
## increased df

par(mar = plotCMar)
plotClusters(ce, whichClusters = "workflow", 
             axisLine = -1, main = "Clusters from clusterMany",
             sampleData=c("batch","clusters"))

plotCoClustering(ce, sampleData = c("clusters"),
                 whichClusters = c("mergeClusters", 
                                   "combineMany"),
                 annLegend = FALSE)

plotHeatmap(ce, clusterSamplesData = "dendrogramValue",
            breaks = .99,
            sampleData = c("clusters"))

getBestFeatures

DE genes between clusters using limma voom

pairsAll = getBestFeatures(ce, contrastType = "Pairs", 
                           p.value = 0.05,
                           number = 10,
                           isCount = TRUE)
## Note: If `isCount=TRUE` the data will be transformed with voom() rather than
## with the transformation function in the slot `transformation`.
## This makes sense only for counts.
head(pairsAll)
##   IndexInOriginal Contrast Feature     logFC   AveExpr         t
## 1           11657    X1-X2   Pola2 -4.392638 5.0490435 -16.25181
## 2           12437    X1-X2    TrnD  4.657239 0.8521260  14.88774
## 3           11961    X1-X2 Col17a1  5.134028 1.2865274  14.79994
## 4            8151    X1-X2   Krt14  6.776355 2.0735590  14.69069
## 5            6808    X1-X2  Gm9385  4.534598 0.8588438  14.05505
## 6            8536    X1-X2    Agr2 -4.711054 3.6779512 -13.78009
##        P.Value    adj.P.Val        B
## 1 4.441282e-48 1.013216e-45 97.85369
## 2 7.996011e-42 1.410304e-39 83.94854
## 3 1.989384e-41 3.437577e-39 82.85541
## 4 6.166364e-41 1.035783e-38 81.78786
## 5 4.159895e-38 6.134360e-36 75.53614
## 6 6.705632e-37 9.306511e-35 72.86724
length(pairsAll$Feature) == length(unique(pairsAll$Feature))
## [1] FALSE
main = "Heatmap of features w/ significant pairwise differences"
clusterFeaturesData = unique(pairsAll[,"IndexInOriginal"])
plotHeatmap(ce, clusterSamplesData = "dendrogramValue",
            clusterFeaturesData = clusterFeaturesData,
            main = main, breaks = .99)